import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier #Import Random Forest Model
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from collections import Counter
from sklearn.metrics import classification_report
import seaborn as sns
# Prep lists to hang on to our result data across models.
# BUG FIX: original had '2.0 Star' (missing 's'), inconsistent with the
# other six labels — fixed so downstream grouping/plotting keys match.
list_ratings = ['1.0 Stars', '1.5 Stars', '2.0 Stars', '2.5 Stars', '3.0 Stars', '3.5 Stars', '4.0 Stars']
# Column names shared by the per-class result dicts built below.
list_measures = ['Rating', 'Model', 'Precision', 'Recall', 'F1-Score', 'Support']
list_results = []   # per-class metrics for every model
list_accuracy = []  # one overall-accuracy dict per model
list_avgs = []      # averaged precision/recall per model
# Load the cleaned project data.
df = pd.read_csv('resources/data/epicurious_data_cleaned_updated.csv')
# Drop the raw text/derived columns the models do not use, keeping only
# the numeric features plus the normalized rating.
unused_cols = ['tags', 'date', 'desc', 'directions', 'ingredients', 'rating', 'title']
df.drop(unused_cols, axis=1, inplace=True)
# The classifiers expect string class labels, so cast the target.
df['rating_normalized'] = df['rating_normalized'].astype(str)
# Re-index after the drop, then rename the target column to the
# conventional "label" name used by the models below.
df.reset_index(drop=True, inplace=True)
df.rename(columns={'rating_normalized': 'label'}, inplace=True)
df.head()
We split the data set into 70% training and 30% testing and set the stratify parameter to ensure equal distribution of the classifications. This train and test data set was used for the subsequent decision tree and random forest models.
# Split the dataset into a feature matrix and a target variable.
feature_cols = ['calories', 'fat', 'protein', 'sodium', 'ingredients_count', 'tags_count', 'age']
X = df[feature_cols]  # features
y = df.label          # target labels
# 70/30 train/test split:
# - random_state pins the split so results match across machines
# - stratify keeps the class proportions equal in train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)
print(f'Train {Counter(y_train)}')
print(f'Test {Counter(y_test)}')
First, we began with a single decision tree. For this model, we limited the max depth to 4 and predicted scores on the test data.
# Create Decision Tree classifier object.
# max_depth=4 keeps the tree shallow enough to visualize;
# random_state=1 matches the split seed so the fit is reproducible
# (the prose below quotes an exact accuracy, which requires a seed).
clf_dt = DecisionTreeClassifier(max_depth=4, random_state=1)
# Train the Decision Tree classifier.
clf_dt = clf_dt.fit(X_train, y_train)
# Predict the response for the test dataset.
y_pred_dt = clf_dt.predict(X_test)
# Model accuracy: how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred_dt))
print("")
# Classification report.
# BUG FIX: classification_report expects (y_true, y_pred); the original
# passed (y_pred_dt, y_test), which swaps per-class precision and recall
# and mislabels support counts. Also fixed the "Decison" typo.
print("Decision Tree Classification Report")
print(classification_report(y_test, y_pred_dt))
# Brute force! Capture the per-class results transcribed from the
# report above and add them to list_results.
list_dt = [{list_measures[0]: list_ratings[0], list_measures[1]: 'Decision Tree', list_measures[2]: .00, list_measures[3]: .00, list_measures[4]:.00, list_measures[5]: 0},
{list_measures[0]: list_ratings[1],list_measures[1]: 'Decision Tree', list_measures[2]: .00, list_measures[3]: .00, list_measures[4]:.00, list_measures[5]: 0},
{list_measures[0]: list_ratings[2],list_measures[1]: 'Decision Tree', list_measures[2]: .00, list_measures[3]: .00, list_measures[4]:.00, list_measures[5]: 0},
{list_measures[0]: list_ratings[3],list_measures[1]: 'Decision Tree', list_measures[2]: .00, list_measures[3]: .00, list_measures[4]:.00, list_measures[5]: 0},
{list_measures[0]: list_ratings[4],list_measures[1]: 'Decision Tree', list_measures[2]: .00, list_measures[3]: .50, list_measures[4]:.01, list_measures[5]: 6},
{list_measures[0]: list_ratings[5],list_measures[1]: 'Decision Tree', list_measures[2]: .93, list_measures[3]: .48, list_measures[4]:.64, list_measures[5]: 2404},
{list_measures[0]: list_ratings[6],list_measures[1]: 'Decision Tree', list_measures[2]: .32, list_measures[3]: .43, list_measures[4]:.37, list_measures[5]: 305}]
list_results.extend(list_dt)
list_accuracy.extend([{'Model':'Decision Tree', 'Accuracy': 0.47661141804788215}])
# BUG FIX: the original line was a syntax error ('Avg Recall': with no
# value). 0.48 is the weighted-average recall implied by the reported
# accuracy above — TODO(review): confirm against a fresh report run.
list_avgs.extend([{'Model': 'Decision Tree', 'Avg Precision': 0.86, 'Avg Recall': 0.48}])
A single decision tree has an accuracy rate of 48%. The classification report shows we only had predictions for 3.0, 3.5 and 4.0 star recipes. The decision tree is illustrated in the visualization below.
# Visualize the fitted decision tree with graphviz.
# BUG FIX: sklearn.externals.six was removed in scikit-learn 0.23; the
# standard library's io.StringIO is the drop-in replacement.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Export the tree as DOT source into an in-memory buffer.
dot_data = StringIO()
export_graphviz(clf_dt, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True,
                feature_names=feature_cols, class_names=clf_dt.classes_)
# Render the DOT source to a PNG file and display it inline.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('decision-tree-viz.png')
Image(graph.create_png())
To increase accuracy, we next used a Random Forest model. Rather than a single decision tree, our model used 100 trees to predict recipe ratings. In this model, the average depth of the trees was 33, with an average of 4,185 nodes per tree.
# Create a random forest classifier using 100 trees.
# BUG FIX (reproducibility): the original set no random_state, so the
# accuracy and tree statistics quoted in the surrounding text could not
# be reproduced. Seed with 1 to match the train/test split seed.
clf_rf = RandomForestClassifier(n_estimators=100, random_state=1)
# Train the model on the training set.
clf_rf.fit(X_train, y_train)
# Predict the response for the test dataset.
y_pred_rf = clf_rf.predict(X_test)
# Stats about the individual trees in the random forest
# (technique from https://towardsdatascience.com/an-implementation-and-explanation-of-the-random-forest-in-python-77bf308a9b76).
n_nodes = [est.tree_.node_count for est in clf_rf.estimators_]
max_depths = [est.tree_.max_depth for est in clf_rf.estimators_]
print(f'Average number of nodes {int(np.mean(n_nodes))}')
print(f'Average maximum depth {int(np.mean(max_depths))}')
# Score on the held-out test set only.
acc = clf_rf.score(X_test, y_test)
print(f"Accuracy = {acc}")
print("")
# Classification report.
# BUG FIX: classification_report expects (y_true, y_pred); the original
# passed (y_pred_rf, y_test), which swaps per-class precision and recall
# and mislabels support counts.
print("Random Forest Classification Report")
print(classification_report(y_test, y_pred_rf))
# Brute force! Capture the per-class Random Forest results (transcribed
# from the report above) and append them to list_results.
_rf_metrics = [
    # (precision, recall, f1-score, support) per rating class
    (.17, 1.00, .30, 4),
    (.10, .50, .17, 2),
    (.06, .67, .10, 6),
    (.12, .59, .20, 44),
    (.29, .43, .34, 499),
    (.78, .53, .63, 1835),
    (.42, .52, .47, 325),
]
list_rf = [
    {list_measures[0]: rating, list_measures[1]: 'Random Forest',
     list_measures[2]: prec, list_measures[3]: rec,
     list_measures[4]: f1, list_measures[5]: support}
    for rating, (prec, rec, f1, support) in zip(list_ratings, _rf_metrics)
]
list_results.extend(list_rf)
list_accuracy.extend([{'Model': 'Random Forest', 'Accuracy': 0.5119705340699816}])
# Build a matrix of actual vs. predicted ratings (with row/column totals).
# https://stats.stackexchange.com/questions/95209/how-can-i-interpret-sklearn-confusion-matrix
df_precision = pd.crosstab(
    y_test, y_pred_rf,
    rownames=['Actual Ratings'],
    colnames=['Predicted Ratings'],
    margins=True)
# Shade the counts so the correctly-predicted cells stand out.
cm = sns.light_palette("teal", as_cmap=True)
df_precision.style.set_caption('# of Recipes Predicted Correctly')\
    .background_gradient(cmap=cm)
Accuracy for the Random Forest increased to 51%. A slight increase - let's review the classification report to see how the data shook out.
Precision is a measure of recipes predicted correctly compared to the total number of actual ratings. The classification report shows the model has the best precision predicting 3.5 star recipes (78%) with poor to extremely poor precision predicting other star ratings (6%-42%). The matrix shows 971 recipes were predicted 3.5 stars correctly out of a total of 1,250 actual 3.5 star recipes. The worst precision was 2 star recipes with only 4 out of 71 predicted correctly.
Recall is a measure of recipes predicted correctly compared to the total number of all predicted ratings. The classification report shows higher recall for 1, 2 and 2.5 star ratings than the others. Once again referring to the matrix, we see 2 star ratings had 4 recipes predicted correctly out of a total of 6 predictions while the 3.5 star ratings had 977 predicted correctly out of 1,835 predictions.
The F1 score is the harmonic mean of precision and recall in the model. The support values represent the total number of recipes counted for each star rating.
The classification report really shows how our imbalanced data affected the accuracy of the model. Classifications with a higher number of records had better precision scores while those with lower numbers had better recall scores. The best f-score of the bunch was for 3.5 star ratings (63%) which is fairly poor. Revisions to the model will address the imbalance of data.
Let's quantify the usefulness of the features provided by reviewing their relative importance in predicting values. This model found sodium and calories to be the most important features when predicting star ratings; however, the measures are similar across all features.
# review feature importance
feature_imp = pd.Series(clf_rf.feature_importances_,index=feature_cols).sort_values(ascending=False)
# feature_imp
# visualize feature importance
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
# plt.legend()
plt.show()
Given our large number of nodes and depth, it's difficult to plot a visualization of a single tree from the forest.
# Plot a single decision tree from the forest.
# BUG FIX: sklearn.externals.six was removed in scikit-learn 0.23 — use
# the standard library's io.StringIO instead. Also removed the duplicated
# imports and the duplicated estimator extraction, and renamed the local
# variable so it no longer shadows the sklearn.tree module name.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Pull one tree out of the forest (estimator #5, chosen arbitrarily).
sample_tree = clf_rf.estimators_[5]
dot_data = StringIO()
export_graphviz(sample_tree, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols)
# Render the DOT source to a PNG file and display it inline.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('random-forest-tree-viz.png')
Image(graph.create_png())